# ---- Setup ----
# Package-level knitr option: evaluate every chunk relative to the RStudio
# project root rather than the directory of the document.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())

# Figure dimensions are *chunk* options, so they must be set through
# opts_chunk; passing them to opts_knit does not change the figure size.
knitr::opts_chunk$set(
  fig.width = 14,
  fig.height = 14
)

# Session-wide printing behaviour: no readr progress bars, 5 significant
# digits, and a penalty of 8 against switching to scientific notation.
options(
  readr.show_progress = FALSE,
  digits = 5,
  scipen = 8
)

# Fix the RNG state so any stochastic step is reproducible.
set.seed(42)

library(tidyverse)
# Read the metadata table (presumably one row per transcript -- confirm
# against the input file) and print it for inspection.
tx_gene_meta <- readr::read_csv("./data/hg_ens97_metadata.txt")
tx_gene_meta

# Gene-level table: keep the first row seen for each versioned gene id and
# retain only the gene-level columns.
gene_meta <- tx_gene_meta %>%
  dplyr::group_by(gene_id_version) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    gene_name,
    gene_len,
    perc_gene_gc,
    gene_id,
    seqnames,
    gene_biotype,
    gene_id_version
  )
gene_meta
# Some gene names are not unique.
# Distinct gene names that occur on more than one row of gene_meta.
# duplicated() flags the second and later occurrences, so unique() is needed
# to collapse names that appear three or more times.
is_repeat_name <- duplicated(gene_meta$gene_name)
duplicated_gene_names <- unique(gene_meta$gene_name[is_repeat_name])

# Number of gene names shared by more than one gene row.
length(duplicated_gene_names)
[1] 147
# Show every gene row whose name is shared with another row, sorted so that
# rows with the same name sit next to each other.
dplyr::arrange(
  dplyr::filter(gene_meta, gene_name %in% duplicated_gene_names),
  gene_name
)
# Number of genes per biotype, most frequent first.
biotype_count <- dplyr::count(gene_meta, gene_biotype, sort = TRUE)
biotype_count

# The (up to) ten most frequent gene biotypes; used to restrict later plots.
biotype_targets <- head(biotype_count$gene_biotype, 10)
# Histogram of perc_gene_gc, one panel per biotype in biotype_targets.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(x = perc_gene_gc)) +
  geom_histogram(bins = 150) +
  facet_wrap(~gene_biotype)

# Histogram of gene_len on a log10 x axis, one panel per biotype.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(x = gene_len)) +
  geom_histogram(bins = 150) +
  scale_x_log10() +
  facet_wrap(~gene_biotype)

# perc_gene_gc against gene_len (log10 y axis) with 2D density contours
# drawn over the points, one panel per biotype.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(x = perc_gene_gc, y = gene_len)) +
  geom_point(size = 0.8, alpha = 0.7) +
  geom_density_2d() +
  scale_y_log10() +
  theme(legend.position = "none") +
  facet_wrap(~gene_biotype)
# Number of rows of tx_gene_meta per transcript biotype, most frequent first.
tx_biotype_count <- dplyr::count(tx_gene_meta, tx_biotype, sort = TRUE)
tx_biotype_count

# The (up to) eight most frequent transcript biotypes; used to restrict the
# transcript-level plots.
tx_biotype_targets <- head(tx_biotype_count$tx_biotype, 8)
# exonic_len against intronic_len (both log10), coloured by perc_gene_gc,
# one panel per transcript biotype in tx_biotype_targets.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(x = exonic_len, y = intronic_len, colour = perc_gene_gc),
    size = 0.3,
    alpha = 0.6
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")

# nexon against nintron (both log10), coloured by gene_len.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(x = nexon, y = nintron, colour = gene_len),
    size = 0.8,
    alpha = 0.7
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)

# premrna_len against exonic_len (both log10) with density contours,
# coloured by nexon; rows with nexon equal to 1 are excluded.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon != 1) %>%
  ggplot(aes(x = premrna_len, y = exonic_len, colour = nexon)) +
  geom_point(size = 0.1, alpha = 0.3) +
  geom_density_2d() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)

# nexon against exonic_len (both log10), coloured by premrna_len.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot(aes(x = nexon, y = exonic_len, colour = premrna_len)) +
  geom_point(size = 0.1, alpha = 0.3) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)
# Rows of tx_gene_meta per (gene_id, gene_name, gene_biotype) combination,
# largest count first.
count_tx_by_gene <- tx_gene_meta %>%
  dplyr::count(gene_id, gene_name, gene_biotype) %>%
  dplyr::arrange(dplyr::desc(n))
count_tx_by_gene

# Distribution of the per-gene row counts.
ggplot(count_tx_by_gene, aes(x = n)) +
  geom_histogram(bins = 100)

# Names of the (up to) twenty genes with the highest counts.
top_genes <- head(count_tx_by_gene$gene_name, 20)
top_genes
[1] "PCBP1-AS1" "MAPK10" "TEX41" "PVT1" "LINC00635" "MIR663AHG" "MIR99AHG" "SNHG14" "MUC20-OT1" "LINC00343" "LINC00293" "HULC" "LINC00511"
[14] "CASC15" "SOX2-OT" "LINC01206" "SGCE" "SPG7" "KCNMA1" "TCF4"
# exonic_len against intronic_len for the transcripts of the twenty genes
# with the highest counts in count_tx_by_gene, coloured by whether the
# transcript has at least 10 exons; one free-scale panel per gene.
#
# Rows are selected by joining on the keys the counts were computed over
# (gene_id, gene_name, gene_biotype) rather than by gene_name alone: gene
# names are not unique in this annotation, so matching on the name could
# pull in transcripts of an unrelated gene that merely shares the name.
tx_gene_meta %>%
  dplyr::semi_join(
    dplyr::slice(count_tx_by_gene, 1:20),
    by = c("gene_id", "gene_name", "gene_biotype")
  ) %>%
  ggplot() +
  geom_point(aes(exonic_len, intronic_len, colour = nexon >= 10)) +
  facet_wrap(gene_biotype ~ gene_name, scales = "free")
# First (up to) twenty genes whose count in count_tx_by_gene is exactly 10.
# The full key rows are kept so transcripts can be matched on gene_id;
# gene_name alone is not unique in this annotation.
tx_10_gene_keys <- count_tx_by_gene %>%
  dplyr::filter(n == 10) %>%
  dplyr::slice(1:20)

# Gene names of those genes (same value the script exposed before).
tx_10_genes <- dplyr::pull(tx_10_gene_keys, gene_name)

# exonic_len against intronic_len for the transcripts of those genes,
# coloured by whether the transcript has at least 10 exons; one free-scale
# panel per gene name.
tx_gene_meta %>%
  dplyr::semi_join(
    tx_10_gene_keys,
    by = c("gene_id", "gene_name", "gene_biotype")
  ) %>%
  ggplot() +
  geom_point(aes(exonic_len, intronic_len, colour = nexon >= 10)) +
  facet_wrap(~gene_name, scales = "free")
NA
# Names of the genes that have exactly one row in tx_gene_meta (n == 1 in
# count_tx_by_gene), and how many such genes there are.
single_transcript_genes <-
  count_tx_by_gene$gene_name[count_tx_by_gene$n == 1]
length(single_transcript_genes)
[1] 37383
# exonic_len against intronic_len (both log10), coloured by perc_gene_gc,
# for genes with exactly one transcript row whose gene biotype is in
# biotype_targets; one panel per transcript biotype.
#
# Single-transcript genes are matched on the keys the counts were computed
# over (gene_id, gene_name, gene_biotype). Matching on gene_name alone would
# also keep every transcript of a multi-transcript gene that happens to share
# its name with a single-transcript gene, because names are not unique here.
tx_gene_meta %>%
  dplyr::semi_join(
    dplyr::filter(count_tx_by_gene, n == 1),
    by = c("gene_id", "gene_name", "gene_biotype")
  ) %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# How many rows of tx_gene_meta have each exon count, most common first.
dplyr::arrange(
  dplyr::count(tx_gene_meta, nexon),
  dplyr::desc(n)
)
# The three plots below are the same figure -- exonic_len against
# intronic_len (both log10), coloured by perc_gene_gc, one panel per
# transcript biotype in tx_biotype_targets -- drawn for different
# exon-count strata.

# Stratum 1: exactly two exons.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon == 2) %>%
  ggplot() +
  geom_point(
    aes(x = exonic_len, y = intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")

# Stratum 2: more than one and fewer than ten exons.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon < 10, nexon > 1) %>%
  ggplot() +
  geom_point(
    aes(x = exonic_len, y = intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")

# Stratum 3: ten or more exons.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon >= 10) %>%
  ggplot() +
  geom_point(
    aes(x = exonic_len, y = intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.1 tools_3.6.1 digest_0.6.20 zeallot_0.1.0 lubridate_1.7.4 jsonlite_1.6
[10] nlme_3.1-140 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0 rstudioapi_0.10 haven_2.1.1 xfun_0.8
[19] withr_2.1.2 xml2_1.2.2 httr_1.4.1 knitr_1.24 generics_0.0.2 vctrs_0.2.0 hms_0.5.0 rprojroot_1.3-2 grid_3.6.1
[28] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 readxl_1.3.1 modelr_0.1.5 magrittr_1.5 MASS_7.3-51.4 backports_1.1.4 scales_1.0.0
[37] rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2 crayon_1.3.4